suppressPackageStartupMessages(library(gapminder))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(DT))
suppressPackageStartupMessages(library(forcats))
suppressPackageStartupMessages(library(tibble))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggrepel))

Exercise 1

Using the “here” package creates relative paths that allow for better sharing capability, whereas using a base R directory path ties the path to the working directory on your local drive. The here package does what other packages can do but in a simpler, platform-independent way. The here package also means the user does not need to set the working directory, which further eases collaboration. When a document is hosted on GitHub, the here package builds its paths relative to the project (repository) root, rather than relying on an absolute path on your hard drive.

Exercise 2

2.1 - Drop Oceania

First, I’ll explore the factors within the gapminder dataset to see what we’re working with.

# Inspect the continent factor: first its levels, then its structure.
levels(gapminder$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"
str(gapminder$continent)
##  Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
# Show the full dataset as an interactive table.
DT::datatable(gapminder)

Now we’ll drop Oceania as a factor level.

# Continents to keep: every continent level except Oceania.
the_north <- c("Africa", "Americas", "Asia", "Europe")
# Keep only the rows whose continent is listed in the_north.
we_the_north <- dplyr::filter(gapminder, continent %in% the_north)

And now I’ll re-explore our data.

# Re-inspect the continent factor after filtering the rows.
levels(we_the_north$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"
str(we_the_north$continent)
##  Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...

Since the level Oceania is still listed, we need to explicitly drop the unused factor level that we filtered out.

# Remove factor levels that no longer occur in any row.
we_the_NORTH <- droplevels(we_the_north)

And now I’ll (re-)re-explore our data.

# Inspect the continent factor once more, now that unused levels are dropped.
levels(we_the_NORTH$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"
str(we_the_NORTH$continent)
##  Factor w/ 4 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...

2.2 - Reorder levels

# Number of observations per remaining continent.
dplyr::count(we_the_NORTH, continent)
## # A tibble: 4 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360

Re-order the continent levels by frequency, so that the number of observations increases from the top to the bottom of the plot.

# Bar chart of how many observations with lifeExp > 50 each continent has.
# fct_infreq() orders the continent levels by frequency (most frequent first);
# after coord_flip() the first level is drawn at the bottom, so the counts
# increase from the top of the plot to the bottom.
we_the_NORTH %>%
  filter(lifeExp > 50) %>%
  ggplot() +
  geom_bar(aes(fct_infreq(continent))) +
  coord_flip() +
  theme_bw() +
  labs(
    y = "# of Observations",
    x = "Continent",
    # Spelling of "Occurrence" corrected in the displayed title.
    title = "Occurrence of lifeExp higher than 50"
  )

Exercise 3

3.1 - Export then import dataset

# Keep only the observations with a life expectancy above 50.
new_data <- dplyr::filter(we_the_NORTH, lifeExp > 50)

# Write the subset to a CSV file located via here::here().
new_data %>%
  write_csv(here::here("new_data.csv"))

# Read the same file back in; readr guesses the column types.
new_data2 <- here::here("new_data.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )
DT::datatable(new_data2)

Explore new data, what happened?

# levels() applied to the whole tibble gives NULL; the per-column types,
# including the factor columns, are shown by str().
levels(new_data)
## NULL
str(new_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1189 obs. of  6 variables:
##  $ country  : Factor w/ 140 levels "Afghanistan",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ continent: Factor w/ 4 levels "Africa","Americas",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  55.2 59.3 64.8 66.2 67.7 ...
##  $ pop      : int  1282697 1476505 1728137 1984060 2263554 2509048 2780097 3075321 3326498 3428038 ...
##  $ gdpPercap: num  1601 1942 2313 2760 3313 ...
# Same inspection for the re-imported data: levels() on the whole tibble is
# NULL, and str() lists the type of every column.
levels(new_data2)
## NULL
str(new_data2)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1189 obs. of  6 variables:
##  $ country  : chr  "Albania" "Albania" "Albania" "Albania" ...
##  $ continent: chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ year     : num  1952 1957 1962 1967 1972 ...
##  $ lifeExp  : num  55.2 59.3 64.8 66.2 67.7 ...
##  $ pop      : num  1282697 1476505 1728137 1984060 2263554 ...
##  $ gdpPercap: num  1601 1942 2313 2760 3313 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country = col_character(),
##   ..   continent = col_character(),
##   ..   year = col_double(),
##   ..   lifeExp = col_double(),
##   ..   pop = col_double(),
##   ..   gdpPercap = col_double()
##   .. )

After the round trip through the CSV file, country and continent are no longer factors; they are now stored as character vectors. I must re-assign country and continent as factors!

# Convert the two character columns back into factors.
new_data2[["country"]] <- factor(new_data2[["country"]])
new_data2[["continent"]] <- factor(new_data2[["continent"]])

Plot the re-assigned factors.

# Bar chart of the number of observations per continent, with the continent
# levels in reversed order.
# labs() names aesthetics, not screen positions: x is the continent aesthetic
# and y is the bar count, even though coord_flip() draws the continents on
# the vertical axis. The labels were previously swapped.
new_data2 %>%
  ggplot() +
  geom_bar(aes(fct_rev(continent), colour = continent)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Continent", y = "# of Observations")

Order by population size:

# Three countries to compare.
new_countries <- c("Canada", "Albania", "Taiwan")
# Keep only the rows belonging to those countries.
new_factor <- dplyr::filter(new_data2, country %in% new_countries)

# Boxplot of life expectancy for each selected country.
# fct_reorder(country, pop) orders the countries by their median population,
# smallest first, so the title describes that ordering (it previously claimed
# an ordering by decreasing lifeExp, which the code does not produce).
new_factor %>%
  ggplot() +
  geom_boxplot(aes(x = fct_reorder(country, pop), y = lifeExp, fill = country)) +
  theme_bw() +
  labs(
    x = "Country",
    y = "Life Expectancy",
    title = "Countries ordered by increasing population"
  )

Exercise 4

# Rwanda's rows with two derived columns:
#   lifeExp_inc - change in life expectancy since the previous observation
#   GDP         - total GDP (gdpPercap * pop)
# The rows are sorted by year BEFORE lag() is applied, so the difference is
# always taken against the chronologically previous row rather than depending
# on the incoming row order. The outer parentheses print the result.
(Rwanda <- gapminder %>%
  filter(country == "Rwanda") %>%
  arrange(year) %>%
  mutate(
    lifeExp_inc = lifeExp - lag(lifeExp),
    GDP = gdpPercap * pop
  ))
## # A tibble: 12 x 8
##    country continent  year lifeExp     pop gdpPercap lifeExp_inc        GDP
##    <fct>   <fct>     <int>   <dbl>   <int>     <dbl>       <dbl>      <dbl>
##  1 Rwanda  Africa     1952    40   2534927      493.      NA         1.25e9
##  2 Rwanda  Africa     1957    41.5 2822082      540.       1.5       1.52e9
##  3 Rwanda  Africa     1962    43   3051242      597.       1.5       1.82e9
##  4 Rwanda  Africa     1967    44.1 3451079      511.       1.1       1.76e9
##  5 Rwanda  Africa     1972    44.6 3992121      591.       0.5       2.36e9
##  6 Rwanda  Africa     1977    45   4657072      670.       0.400     3.12e9
##  7 Rwanda  Africa     1982    46.2 5507565      882.       1.22      4.86e9
##  8 Rwanda  Africa     1987    44.0 6349365      848.      -2.20      5.38e9
##  9 Rwanda  Africa     1992    23.6 7290203      737.     -20.4       5.37e9
## 10 Rwanda  Africa     1997    36.1 7212583      590.      12.5       4.26e9
## 11 Rwanda  Africa     2002    43.4 7852401      786.       7.33      6.17e9
## 12 Rwanda  Africa     2007    46.2 8860588      863.       2.83      7.65e9
# Scatter plot of the change in life expectancy against total GDP (log10
# x axis with comma-formatted labels), each point labelled with its year.
ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc)) +
  geom_point() +
  scale_x_log10(labels = scales::comma_format()) +
  geom_text(aes(label = year), hjust = 0, vjust = 1)

# First plot: default scales and theme, year labels drawn with geom_text().
plot1 <- ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc)) +
  geom_point() +
  geom_text(aes(label = year), hjust = 0, vjust = 0)


# Better plot: log10 x axis with comma labels, repelled year labels,
# axis titles, a plot title and the black-and-white theme.
plot2 <- ggplot(Rwanda, aes(x = GDP, y = lifeExp_inc, label = year)) +
  geom_point() +
  scale_x_log10(labels = scales::comma_format()) +
  geom_text_repel() +
  labs(
    y = " Change in life expectancy",
    x = "Gross GDP",
    title = "Rwandan census and productivity data"
  ) +
  theme_bw()

# Draw plot1 and plot2 next to each other in a single row under a shared title.
gridExtra::grid.arrange(
  plot1,
  plot2,
  nrow = 1,
  top = "Before and After Data Visualization"
)

# Scatter plot of the change in life expectancy against total GDP, with the
# points coloured by year.
plot3 <- Rwanda %>%
  ggplot(aes(x = GDP, y = lifeExp_inc, colour = year)) +
  geom_point() +
  labs(title = "Rwandan GDP and change in life expectancy (1957-2007)")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(plot3)

##Exercise 5

# Save the improved plot (plot2) as a JPEG in the directory returned by
# here::here(). The filename now carries the .jpg extension; previously the
# image was written to a file named just "Betterplot", with no extension.
ggsave(
  filename = "Betterplot.jpg",
  plot = plot2,
  device = "jpeg",
  path = here::here()
)
## Saving 7 x 5 in image